Landsat-8 Cloud Statistics¶

In [1]:
# Import Data Cube Utilities
import datacube
import sys, os
os.environ['USE_PYGEOS'] = '0'
from dea_tools.plotting import rgb, display_map
from dea_tools.bandindices import calculate_indices

### EASI tools
sys.path.append(os.path.expanduser('../scripts'))
from ceos_utils.data_cube_utilities.clean_mask import landsat_clean_mask_invalid, landsat_qa_clean_mask
from easi_tools import EasiDefaults
from easi_tools import notebook_utils
easi = EasiDefaults() # Get the default parameters for this system

# Import Common Utilities
from functools import partial
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import pandas as pd
Successfully found configuration for deployment "eail"
In [2]:
# Start a local Dask cluster for parallel, lazy loading of the satellite data.
# use_gateway=False keeps the cluster on this notebook server (a LocalCluster)
# rather than requesting a remote cluster from a Dask Gateway.
cluster, client = notebook_utils.initialize_dask(use_gateway=False)
display(cluster if cluster else client)
# Print a dashboard URL proxied through the JupyterHub so it is reachable
# from the user's browser (plain 127.0.0.1 links would not be).
print(notebook_utils.localcluster_dashboard(client, server=easi.hub))

LocalCluster

5911d938

Dashboard: http://127.0.0.1:8787/status Workers: 4
Total threads: 8 Total memory: 29.00 GiB
Status: running Using processes: True

Scheduler Info

Scheduler

Scheduler-1f0979a2-b391-4c98-8709-c254ea225466

Comm: tcp://127.0.0.1:42963 Workers: 4
Dashboard: http://127.0.0.1:8787/status Total threads: 8
Started: Just now Total memory: 29.00 GiB

Workers

Worker: 0

Comm: tcp://127.0.0.1:38511 Total threads: 2
Dashboard: http://127.0.0.1:35211/status Memory: 7.25 GiB
Nanny: tcp://127.0.0.1:43783
Local directory: /tmp/dask-worker-space/worker-34oce967

Worker: 1

Comm: tcp://127.0.0.1:36783 Total threads: 2
Dashboard: http://127.0.0.1:40169/status Memory: 7.25 GiB
Nanny: tcp://127.0.0.1:43167
Local directory: /tmp/dask-worker-space/worker-w4xminv9

Worker: 2

Comm: tcp://127.0.0.1:36193 Total threads: 2
Dashboard: http://127.0.0.1:38873/status Memory: 7.25 GiB
Nanny: tcp://127.0.0.1:38327
Local directory: /tmp/dask-worker-space/worker-bi5xr7vb

Worker: 3

Comm: tcp://127.0.0.1:33987 Total threads: 2
Dashboard: http://127.0.0.1:41977/status Memory: 7.25 GiB
Nanny: tcp://127.0.0.1:38279
Local directory: /tmp/dask-worker-space/worker-j44blu3e
https://hub.eail.easi-eo.solutions/user/jhodge/proxy/8787/status
In [3]:
# Configure S3 credentials for reading Landsat Collection 2 data from AWS.
# The USGS Landsat bucket is requester-pays, so signed requests are required
# (aws_unsigned=False). Passing `client` forwards the credentials to every
# Dask worker, not just this kernel.
from datacube.utils.aws import configure_s3_access
configure_s3_access(aws_unsigned=False, requester_pays=True, client=client)
Out[3]:
<botocore.credentials.Credentials at 0x7f2979c309d0>
In [4]:
# Define the Product
# Landsat-8 Collection 2 Level-2 surface reflectance, as indexed in this datacube.
product = "landsat8_c2l2_sr"
In [5]:
# Select a Latitude-Longitude point for the center of the analysis region
# Select the size of the box (in degrees) surrounding the center point

# Mombasa, Kenya
# lat_long = (-4.03, 39.62)
# box_size_deg = 0.15

# Calculates the latitude and longitude bounds of the analysis box
# latitude = (lat_long[0]-box_size_deg/2, lat_long[0]+box_size_deg/2)
# longitude = (lat_long[1]-box_size_deg/2, lat_long[1]+box_size_deg/2)

# Sydney Cricket Ground
# latitude = (-33.8951, -33.8902)
# longitude = (151.2219, 151.2276)

# Suva, Fiji
# latitude = (-18.1725, -18.0492) 
# longitude = (178.3881, 178.5190) 

# Bua Bay, Fiji
# latitude = (-17.0069, -16.7447) 
# longitude = (178.4270, 178.6750) 

# Default: use this EASI deployment's example region. Uncomment one of the
# presets above (or supply your own (min, max) tuples) to analyse elsewhere.
latitude = easi.latitude
longitude = easi.longitude
In [6]:
# Select a time range
# The inputs require a format (Min,Max) using this date format (YYYY-MM-DD)
# The Landsat-8 allowable time range is: 2013-04-07 to current
# Five months is enough for roughly 10-15 scenes at a 16-day revisit cycle.
time_extents = ('2021-01-01', '2021-06-01')
In [7]:
# The code below renders a map that can be used to view the region.
# It is possible to find new regions using the map below. 
# Use your mouse to zoom in/out to explore new regions
# Click on the map to view Lat-Lon coordinates of any location that could define the region boundary
# (The pyproj FutureWarning this emits comes from dea_tools and is harmless.)

display_map(longitude, latitude)
/env/lib/python3.10/site-packages/dea_tools/plotting.py:313: FutureWarning: This function is deprecated. See: https://pyproj4.github.io/pyproj/stable/gotchas.html#upgrading-to-pyproj-2-from-pyproj-1
  all_longitude, all_latitude = transform(Proj(crs), Proj('EPSG:4326'), all_x,
Out[7]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Calculate cloud coverage percentage for each pixel¶

In [8]:
# Create a custom cloud coverage table here
def build_cloud_coverage_table_landsat(product, platform, collection, level,
                                       latitude, longitude, time=None, dc=None,
                                       extra_band='green', extra_load_params=None):
    """Load a Landsat time series and summarise per-scene cloud statistics.

    Parameters
    ----------
    product : str
        Datacube product name (e.g. "landsat8_c2l2_sr").
    platform, collection, level : str
        Passed to the CEOS masking utilities (e.g. 'LANDSAT_8', 'c2', 'l2').
    latitude, longitude : tuple
        (min, max) extents of the analysis region in degrees.
    time : tuple, optional
        ('YYYY-MM-DD', 'YYYY-MM-DD') range; omitted from the load if None.
    dc : datacube.Datacube, optional
        Existing Datacube connection; a new one is created if None.
    extra_band : str
        A data band to load alongside the QA band.
    extra_load_params : dict, optional
        Extra keyword arguments forwarded to ``dc.load`` (CRS, resolution,
        dask chunking, group_by, ...). Defaults to no extra arguments.
        NOTE: previously this defaulted to a mutable ``{}``, which is shared
        between calls in Python; fixed to ``None``.

    Returns
    -------
    tuple
        (landsat_dataset, per-date statistics DataFrame,
         cloud_masks, nodata_masks, clean_masks).
    """
    dc = dc if dc is not None else datacube.Datacube(app="")

    if extra_load_params is None:
        extra_load_params = {}

    load_params = dict(product=product,
                       latitude=latitude,
                       longitude=longitude,
                       measurements=[extra_band, 'pixel_qa'],
                       **extra_load_params)

    if time is not None:
        load_params["time"] = time

    # persist() keeps the dask-backed data in cluster memory so the several
    # mask reductions below do not re-read the scenes from S3.
    landsat_dataset = dc.load(**load_params).persist()

    # One entry per acquisition, truncated to whole days for the table.
    dates = [dt.astype('datetime64[D]') for dt in landsat_dataset.time.values]

    # A pixel is "clean" when the QA band flags it clear AND its value lies in
    # the valid range for this platform/collection/level.
    clean_masks = landsat_qa_clean_mask(landsat_dataset, platform=platform,
                                        collection=collection, level=level) & \
                  landsat_clean_mask_invalid(landsat_dataset, platform, collection, level)
    clean_percent = [clean_mask.mean() * 100 for clean_mask in clean_masks.values]
    clean_count = list(map(np.sum, clean_masks.values))

    # Build a per-pixel no-data mask across all loaded bands.
    nodata_masks = xr.full_like(clean_masks, False)
    band_nodata_values = dc.list_measurements().loc[product, 'nodata']
    if band_nodata_values is not None:
        for data_var in landsat_dataset.values():
            # .get() guards against bands that carry no 'nodata' attribute
            # (the previous attrs['nodata'] lookup raised KeyError for those).
            nodata_value = data_var.attrs.get('nodata')
            if nodata_value is None:
                continue
            band_nodata_masks = data_var == nodata_value
            nodata_masks = nodata_masks | band_nodata_masks
    nodata_percent = [nodata_mask.mean() * 100 for nodata_mask in nodata_masks.values]

    # Cloudy = valid (non-nodata) pixels that are not clean.
    cloud_masks = (~clean_masks & ~nodata_masks)
    cloud_percent = [cloud_mask.mean() * 100 for cloud_mask in cloud_masks.values]

    # Total valid (non-nodata) pixels per scene.
    total_count = list(map(np.sum, ~nodata_masks.values))

    data = {"Date": dates, "Clean_percent": clean_percent, "Cloud_percent": cloud_percent,
            "NoData_percent": nodata_percent, "Clean_count": clean_count, "Total_count": total_count}

    return (landsat_dataset,
            pd.DataFrame(data=data, columns=list(data.keys())),
            cloud_masks, nodata_masks, clean_masks)
In [9]:
dc = datacube.Datacube()
In [10]:
# Load the metadata for the given region and time period
# The helper returns a 5-tuple; [0:2] keeps only the dataset and the
# per-date statistics table (the three mask arrays are discarded here).
landsat_dataset, coverage_table = build_cloud_coverage_table_landsat(dc=dc,
                                                                     product=product,
                                                                     platform='LANDSAT_8',
                                                                     collection='c2',
                                                                     level='l2',
                                                                     latitude=latitude,
                                                                     longitude=longitude,
                                                                     time=time_extents,
                                                                     extra_band='green',
                                                                     extra_load_params={
                                                                         'output_crs':'EPSG:6933',
                                                                         'resolution': (-30,30),
                                                                         'skip_broken_datasets': True,
                                                                         'dask_chunks': {'time':1},
                                                                         'group_by': 'solar_day'
                                                                     }
                                                                    )[0:2]

Create a table of cloud coverage percentage for each date¶

This table displays data for each time slice in the cube (starting at an index=0). The "Clean_percent" is the percentage of pixels WITHOUT clouds. So, low numbers are cloudy scenes and high numbers are clearer scenes. The "Clean_count" is the number of clear pixels in the scene and the "Total_count" is the total number of pixels in the scene.

Typically, there is a separation of 16 days between Landsat-8 views of a single location. When there is significant cloud cover, scenes may be missing from the time series due to issues with processing and geolocation.

In [11]:
# Show every row of the table (pandas truncates long frames by default).
pd.set_option('display.max_rows', len(coverage_table))
coverage_table
Out[11]:
Date Clean_percent Cloud_percent NoData_percent Clean_count Total_count
0 2021-01-05 1.854273 97.636301 0.509426 2042 109563
1 2021-01-14 0.000000 100.000000 0.000000 0 110124
2 2021-01-21 0.000000 100.000000 0.000000 0 110124
3 2021-01-30 99.522357 0.477643 0.000000 109598 110124
4 2021-02-06 99.436998 0.563002 0.000000 109504 110124
5 2021-03-03 99.992735 0.007265 0.000000 110116 110124
6 2021-03-10 99.591370 0.408630 0.000000 109674 110124
7 2021-03-26 63.416694 36.583306 0.000000 69837 110124
8 2021-04-04 99.929171 0.070829 0.000000 110046 110124
9 2021-04-11 80.254077 19.745923 0.000000 88379 110124
10 2021-04-20 9.891577 90.108423 0.000000 10893 110124
11 2021-04-27 99.957321 0.042679 0.000000 110077 110124
12 2021-05-06 99.999092 0.000908 0.000000 110123 110124
13 2021-05-13 99.854709 0.145291 0.000000 109964 110124
14 2021-05-22 0.000000 100.000000 0.000000 0 110124

Create a plot of cloud coverage percentage for each date¶

The y-axis is the "clean percentage" from the table above.

In [12]:
# Plot the clean-pixel percentage for each acquisition date.
# Improvements over the original: explicit fig/ax interface instead of the
# pyplot state machine, and a title plus axis labels so the figure is
# self-describing when skimmed (the original had none).
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(coverage_table["Date"].values, coverage_table["Clean_percent"].values,
        'bo', markersize=8)
ax.set_ylim([0, 105])
ax.set_title('Landsat-8 clean-pixel percentage per scene')
ax.set_xlabel('Date')
ax.set_ylabel('Clean percent (%)')
plt.show()

Review an RGB scene for a selected time slice¶

In [13]:
# Load the data to create an RGB image
# NOTE(review): this rebinds `landsat_dataset`, replacing the green/pixel_qa
# dataset loaded earlier. `like=` reuses that dataset's grid, extent and time
# query, so the time-slice indices from the coverage table still line up.
landsat_dataset = dc.load(like=landsat_dataset,
                          product=product,measurements=['red', 'green', 'blue', 'nir', 'swir1', 'swir2'],
                          dask_chunks={'time':1}) 
In [14]:
# Select one of the time slices and create an output image. 

# Time slices are numbered from 0 to x and shown in the summary table above
# Review the clean_percentage values to select scenes with few clouds
# Clouds will be visible in WHITE for an output image
# NOTE(review): the name `slice` shadows Python's built-in slice() for the
# rest of the notebook; consider renaming (e.g. time_index) in a future pass.

slice = 4
In [15]:
# Collection 2 needs scale and offset applied for better color rendering
def scale(da):
    """Apply the Landsat Collection 2 Level-2 scale factor and offset.

    Converts stored integer values to surface reflectance via
    ``value * 0.0000275 - 0.2`` (the USGS-documented conversion).
    Works element-wise on scalars, numpy arrays and xarray objects.

    Replaces a lambda bound to a name with a ``def``, per PEP 8.
    """
    return (da * 0.0000275) - 0.2
In [16]:
# Define the RGB image parameters
# True-Color = red, green, blue (this is the common true-color RGB image)
# False Color = swir2, nir, green (this is commonly used for Landsat data viewing)

# .map(scale) applies the Collection 2 scale/offset to each band, then
# .to_array() stacks the three bands into a single array for imshow.
true_rgb = landsat_dataset.isel(time=slice)[['red', 'green', 'blue']].map(scale).to_array()
false_rgb = landsat_dataset.isel(time=slice)[['swir2', 'nir', 'green']].map(scale).to_array()

# Define the plot settings and show the plots
# Users may want to alter the figure sizes or plot titles
# The "vmax" value controls the brightness of the images and can be adjusted 

fig, ax = plt.subplots(1, 2, figsize=(16, 8))
true_rgb.plot.imshow(ax=ax[0], vmin=0, vmax=0.2)
false_rgb.plot.imshow(ax=ax[1], vmin=0, vmax=0.5)
ax[0].set_title('True Color'), ax[0].xaxis.set_visible(False), ax[0].yaxis.set_visible(False)
ax[1].set_title('False Color'), ax[1].xaxis.set_visible(False), ax[1].yaxis.set_visible(False)
plt.show()
In [ ]: